Import Packages and Set Up Environment

# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# Installing Packages: 

#  BiocManager::install("oligo")
#  BiocManager::install("limma")
#  BiocManager::install("biomaRt")
#  BiocManager::install("AnnotationDbi")
#  BiocManager::install("Biobase")

# Importing Libraries

#  library(oligo)
#  library(limma)
#  library(biomaRt)
#  library(dplyr)
#  library(tibble)
#  library(methods)
#  library(AnnotationDbi)
#  library(Biobase)

Import Raw Data

    #Reading in runsheet with metadata

csv_file <- read.table(file = r"(C:\Users\linde\rmarkdown\affy_processing\data_runsheets\GLDS_6_runsheet.csv)", 
                       sep=",",
                       header = TRUE,
                       check.names = FALSE
                       )

    #Accessing arrray data files located in column name 'Path to Raw Data File'

array_data_files <- csv_file["Path to Raw Data File"]

    #Importing and read *.CEL files using oligo function 'read.celfiles()'

raw_data <- oligo::read.celfiles(array_data_files) 
## Platform design info loaded.
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep1_GSM144733_raw1.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep2_GSM144734_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep3_GSM144735_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep1_GSM144736_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep2_GSM144737_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep3_GSM144738_raw.CEL.gz
#'raw_data' type - ExpressionFeatureSet Class

#   Parameter Definitions:

#file – path of runsheet file to be accessed
#sep - setting delimiter, default is white space
#header - determines if first row in dataframe is column names
#check.names - if TRUE, variable names are checked and corrected for synaticially validity

Quality Analysis

QA: Visualization

#Assess quality of raw data through visualization of plots

# First plot: density plot
# hist() function plots the density estimates for each sample
# Purpose: Determine technical variation between samples

density_plot <- hist(raw_data, transfo=log2, which=c("pm", "mm", "bg", "both", "all"),
                    nsample=10000, target = "core", main = "Density of raw intensities for multiple arrays")

## TODO: Add sample legend   
# Second Plot: Pseudoimage
# image() function produces a pseudoimage for each array
# Purpose: Identify spatial abnormalities

pseudoimage <- image(raw_data, transfo=log2)

# Third Plot: MA plot
# MAplot() function creates MA plots using a reference array
# Purpose: Comparison of intensity values from one sample to another

MA_plot <- MAplot(raw_data, ylim=c(-2, 4))

#   Parameter Definitions:

# transfo - used to scale the data
# which - defines specific probe types
# nsample - sample size used to produce plot
# target - specifies group of meta-probeset
# main - main title 
# ylim - scale for y axis

QA: Statistics

IQRray

# The IQRray statistic is obtained by ranking all the probe intensities from
# a given array and by computing the average rank for each probe set. The
# interquartile range (IQR) of the probe sets average ranks serves then as
# quality score. 
    #data - ExpressionFeatureSet object obtained after reading in celfiles into R 
    
    #obtaining intensity values for perfect match (pm) probes

pm_data<-pm(raw_data)
    
    #ranking probe intensities for every array 

rank_data<-apply(pm_data,2,rank)
    
    #obtaining names of probeset for every probe

probeNames<-probeNames(raw_data)
    
    #function computing IQR of mean probe ranks in probesets 

get_IQR<-function(rank_data,probeNames){round(IQR(sapply(split(rank_data,probeNames),mean)),digits=2)}
    
    #computing arIQR score

IQRray_score<-apply(rank_data,2,get_IQR,probeNames=probeNames)  

Normalize Data

#Probe Level normalization

    #Converting raw data from ExpressionFeatureSet object to matrix 

raw_table <- exprs(raw_data)

    #Performing background correction

backgroundCorrection <- preprocessCore::rma.background.correct(raw_table)

    #Performing Normalization using Quantile Normalization

probelevel_table <- preprocessCore::normalize.quantiles(backgroundCorrection, keep.names=TRUE)

    #Writing probe level normalization to file

#write.csv(probelevel_table, r"(C:\Users\linde\rmarkdown\notebook\probelevel_table.csv)")
#Probeset Level Normalization

    #RMA (Robust Multi-array Average) is most commonly used method for preprocessing normalization of 
    #Affymetrix microarray data. There are three steps in the RMA algorithm: Convolution Background 
    #Correction, Quantile Normalization, and Summarization using Median Polish.

    #Performing RMA

normRMA <- rma(raw_data)
## Background correcting
## Normalizing
## Calculating Expression
    #Converting normalized data to a matrix for easier accessibility 

norm_probeset_table <- exprs(normRMA)  

    #Writing probeset level normalization to file

#write.csv(norm_probeset_table, r"(C:\Users\linde\rmarkdown\notebook\norm_probeset_table.csv)")

Quality Analysis Visualization for Normalized Data

#QA to ensure normalization worked properly
#Samples should appear more similar to each other after normalization
#Same types of plots as QA of raw data for easy comparison

MAplot_norm <- MAplot(normRMA, ylim=c(-2, 4))

densityPlot_norm <- hist(normRMA, main = "Density of raw intensities for multiple arrays")

Differential Expression (DE) Analysis

## Getting Factor Values
    #Creating a list of Factor Values

fv <- data.frame(csv_file["Factor Value"])

all_factor_values <- fv[,1]

    #Obtaining the name of each factor value

factor_values <- unique(all_factor_values)

    #Making the names for both lists syntactically valid

all.factor.values <- make.names(all_factor_values)

factor.values <- make.names(factor_values)
## Creating Design Matrix
    #Adding factor values to phenoData

ph = raw_data@phenoData

ph@data[ ,2] = c(all.factor.values)

colnames(ph@data)[2]="source"

    #Grouping factor values

groups = ph@data$source

    #Transforming group names into statistical factors

f = factor(groups,levels = factor.values)

    #Creating matrix of values of the grouping variable

design = model.matrix(~ 0 + f) 

colnames(design) <- factor.values
## Forming Constrast Matrix
    #Fit design matrix to a linear model using lmFit()

fit = limma::lmFit(norm_probeset_table,design)

fit.groups <- colnames(fit$design) [which(fit$assign == 1)]

fit.index <- which(levels(levels) %in% fit.groups)

    #Creating comparisons for contrast matrix

fit.group.names <- gsub(" ", "_", sub(",", "_", unique(csv_file$'Factor Value')))

combos <- combn(fit.groups, 2)

combos.names <- combn(fit.group.names, 2)

contrasts <- c(paste(combos[1,], combos[2,], sep = "-", paste(combos[2,], combos[1,], sep="-")))

contrast.names <- c(paste(combos.names[1,], combos.names[2,], sep = "v"), paste(combos.names[2,], 
                    combos.names[1,], sep = "v"))

    #Creating Contrast Matrix

cont.matrix <- limma::makeContrasts(contrasts = contrasts, levels=design)

contrast.fit <- limma::contrasts.fit(fit, cont.matrix)
## Testing and Accessing Results
    #Performing statistical t-test using eBayes()

contrast.fit.eb <- limma::eBayes(contrast.fit)

    #Extracting log fold change values

log_fold_change <- contrast.fit.eb$coefficients

    #Extracting p-values

p_value <- contrast.fit.eb$p.value

    #Changing names

log.fold.change <- c(log_fold_change) 

p.value <- c(p_value)

    #Adjusting the p-values using the Benjamini-Hochberg methods

adjusted_pvalue <- p.adjust(p_value,method="BH")

    #Combining normalized expressions table with test results

de_dataframe <- cbind(norm_probeset_table, log.fold.change, p.value, adjusted_pvalue)

##TODO debug decideTests and summary functions
    ##This part is only for averaging the expression values for each factor value group
# for glds 6
# TODO: figure out how to separate factor values automatically
non_irradiated <- norm_probeset_table[,c(1,2,3)]
ion_beam_radiation <- norm_probeset_table[,c(4,5,6)]
avg_fv_6 <- data.frame(rowMeans(non_irradiated), rowMeans(ion_beam_radiation))
de_dataframe <- cbind(norm_probeset_table, log.fold.change, p.value, adjusted_pvalue, avg_fv_6)

## TODO: make dataframe for glds 189

Gene Annotations

## GLDS 6
    #Searching Ensembl 
    
listEnsembl()
##         biomart                version
## 1         genes      Ensembl Genes 107
## 2 mouse_strains      Mouse strains 107
## 3          snps  Ensembl Variation 107
## 4    regulation Ensembl Regulation 107
    #Selecting mart object pointing to the database

ensembl <- useEnsembl(biomart = "genes")

    #Searching datasets in this database
searchDatasets(mart = ensembl, pattern = "Rat") 
##                      dataset           description   version
## 167 rnorvegicus_gene_ensembl Rat genes (mRatBN7.2) mRatBN7.2
    #Selecting dataset

ensembl <- useDataset(dataset = "rnorvegicus_gene_ensembl", mart = ensembl)

    #Finding the microarray probe ID attribute
    
searchAttributes(mart = ensembl, pattern = "affy_")
##                      name                 description         page
## 91           affy_rae230a          AFFY RAE230A probe feature_page
## 92           affy_rae230b          AFFY RAE230B probe feature_page
## 93    affy_raex_1_0_st_v1   AFFY RaEx 1 0 st v1 probe feature_page
## 94  affy_ragene_1_0_st_v1 AFFY RaGene 1 0 st v1 probe feature_page
## 95  affy_ragene_2_1_st_v1 AFFY RaGene 2 1 st v1 probe feature_page
## 96          affy_rat230_2         AFFY Rat230 2 probe feature_page
## 97           affy_rg_u34a          AFFY RG U34A probe feature_page
## 98           affy_rg_u34b          AFFY RG U34B probe feature_page
## 99           affy_rg_u34c          AFFY RG U34C probe feature_page
## 100           affy_rn_u34           AFFY RN U34 probe feature_page
## 101           affy_rt_u34           AFFY RT U34 probe feature_page
    #Generate mapped dataframe using getBM()

my_affy_ids <- rownames(norm_probeset_table)
gene_annotations <- getBM(
    attributes = c(
        "affy_rat230_2",
        "ensembl_gene_id",
        "uniprot_gn_symbol",
        "go_id"
        ), 
    filters = "affy_rat230_2", 
    values = my_affy_ids,  
    mart = ensembl)

#   Parameter Definitions:
#attributes - desired attributes extracted from specified database
#filters - filters to be used in the query
#values - query IDs
#mart - object connected to desird database and dataset
    #Converting multimappings from multiple rows into list fields

gene_annotations <- gene_annotations %>% 

        #grouping by probe ID

         group_by(affy_rat230_2) %>% 

         summarise( 

            #Convert uniprot_gn_symbol to a string of unique values in the group

            SYMBOL = toString(unique(uniprot_gn_symbol)), 

            #Convert Ensembl IDs to a string of unique IDs in the group

            ENSEMBL = toString(unique(ensembl_gene_id)), 

            #Convert go_ids to a string of unique IDs in the group

            GO_Ids = toString(unique(go_id))
            ) 

Creating Final Table

#glds6

    #Joining tables to map to items in DE dataframe using left.join()

        #Creating a column with probe IDS in DE dataframe

final_dataframe <- rownames_to_column(de_dataframe, var="probesets") %>% left_join(gene_annotations, by = c("probesets" = "affy_rat230_2"))

    #Joining using probe IDS as common factor
## Writing Final table to file
#write.csv(final_dataframe, r"(C:\Users\linde\rmarkdown\notebook\GLDS_6_DGE.csv)")
#write.csv(final_dataframe, r"(C:\Users\linde\rmarkdown\notebook\GLDS_25_DGE.csv)")